package cn.yuyueq.esjd.utils;

import cn.yuyueq.esjd.pojo.Content;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Component;

import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

/**
 * <p>
 *
 * </p>
 *
 * @author wenxin.du
 * @since 2021/9/22
 */
@Component
public class HtmlParseUtil {

    //public static void main(String[] args) throws IOException {
    //    new HtmlParseUtil().parseJD("Java编程思想").forEach(System.out::println);
    //}

    public List<Content> parseJD(String keywords) throws IOException {
        //获取请求 https://search.jd.com/Search?keyword=java&enc=utf-8
        //前提需要连网
        String url = "https://search.jd.com/Search?keyword=" + keywords + "&enc=utf-8";
        //解析网页 (Jsoup返回Document就是浏览器Document对象)
        Document document = Jsoup.parse(new URL(url), 30000);
        //所有在js中能使用的方法,这里都能用
        Element element = document.getElementById("J_goodsList");
        //获取所有li元素
        Elements elements = element.getElementsByTag("li");

        ArrayList<Content> goodList = new ArrayList<>();

        //通过元素中的内容,这里el就是每一个li标签了
        for (Element el : elements) {
            //加if判断是为了 过滤空标签
            if (el.attr("class").equalsIgnoreCase("gl-item")) {
                //关于这种图片特别多的网页,所有的图片都是延迟加载的
                //在jd搜索后f12可以看到存放在data-lazy-img中
                String img = el.getElementsByTag("img").eq(0).attr("data-lazy-img");
                String price = el.getElementsByClass("p-price").eq(0).text();
                String title = el.getElementsByClass("p-name").eq(0).text();

                Content content = new Content();
                content.setImg(img);
                content.setPrice(price);
                content.setTitle(title);
                goodList.add(content);
            }

        }
        return goodList;
    }

}
